Cases

The analysis of the case-only data

Brian Muchmore https://github.com/bmuchmore (GENYO) http://www.genyo.es/
December 08, 2018

NOTE

Given the same data, the following commands should yield almost identical results to those shown here, with any discrepancies being stochastic in nature. Some of these commands, however, can take a long time to run, so while we show the commands as we originally ran them, the results are often read back from file. If you are trying to reproduce these results using the exact data and code used here and run into problems or incongruous results, please submit an issue, and we will address it as soon as possible.

Set-up

All data analysis was done on a desktop with 8 cores/16 threads (AMD Ryzen 7 1800x) and 32 GB of DDR4 memory. We begin by setting a “cores” variable for future use.


# Number of CPU cores available; used below as the worker count when the
# parallel plan is set for the full distance calculation.
cores <- 8

This is a general purpose naming function that will be used to generate any paths seen below:


# Build the path "<path>/<folder>/<file>.<type>" for a file used in this
# analysis.
#
# path:   root directory of the project.
# folder: sub-directory of `path` that holds the file.
# file:   file name without its extension (required, no default).
# type:   file extension, without the leading dot.
#
# Returns the assembled path(s) as a character vector.
name <- function(path = "/home/brian/Desktop/flow", folder = "cases", file, type = "rds") {
  directory <- paste0(path, "/", folder)
  file_name <- paste0(file, ".", type)
  paste0(directory, "/", file_name)
}

Package Information

These are the packages I will be using.


library(PreciseDist)
library(readr)
library(future)
library(doFuture)
library(heatmaply)

This is the session info.


# Print the R version, platform and package versions used for this analysis.
sessionInfo()

R version 3.4.4 (2018-03-15)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 18.04.1 LTS

Matrix products: default
BLAS: /usr/lib/x86_64-linux-gnu/openblas/libblas.so.3
LAPACK: /usr/lib/x86_64-linux-gnu/libopenblasp-r0.2.20.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] parallel  stats     graphics  grDevices utils     datasets 
[7] methods   base     

other attached packages:
 [1] heatmaply_0.15.2       viridis_0.5.1         
 [3] viridisLite_0.3.0      plotly_4.7.1          
 [5] ggplot2_3.0.0          doFuture_0.6.0        
 [7] iterators_1.0.10       foreach_1.4.4         
 [9] future_1.9.0           readr_1.1.1           
[11] PreciseDist_0.0.0.9000

loaded via a namespace (and not attached):
  [1] R.utils_2.6.0           tidyselect_0.2.4       
  [3] htmlwidgets_1.2         TSP_1.1-6              
  [5] trimcluster_0.1-2       grid_3.4.4             
  [7] ranger_0.10.1           Rtsne_0.13             
  [9] munsell_0.5.0           codetools_0.2-15       
 [11] SNFtool_2.3.0           miniUI_0.1.1.1         
 [13] misc3d_0.8-4            withr_2.1.2            
 [15] colorspace_1.3-2        longitudinalData_2.4.1 
 [17] Boruta_6.0.0            knitr_1.20             
 [19] geometry_0.3-6          stats4_3.4.4           
 [21] robustbase_0.93-1       dtw_1.20-1             
 [23] dimRed_0.1.0            DistatisR_1.0          
 [25] listenv_0.7.0           radix_0.5.0.9001       
 [27] DistributionUtils_0.5-1 rprojroot_1.3-2        
 [29] locpol_0.7-0            ipred_0.9-6            
 [31] randomForest_4.6-14     gclus_1.3.1            
 [33] diptest_0.75-7          R6_2.2.2               
 [35] seriation_1.2-3         fields_9.6             
 [37] rpivotTable_0.3.0       flexmix_2.3-14         
 [39] manipulateWidget_0.10.0 DRR_0.0.3              
 [41] bitops_1.0-6            assertthat_0.2.0       
 [43] promises_1.0.1          networkD3_0.4          
 [45] SDMTools_1.1-221        scales_1.0.0           
 [47] nnet_7.3-12             mmtsne_0.1.0           
 [49] gtable_0.2.0            ddalpha_1.3.4          
 [51] globals_0.12.1          spam_2.2-0             
 [53] timeDate_3043.102       rlang_0.2.2            
 [55] CVST_0.2-2              RcppRoll_0.3.0         
 [57] profileModel_0.5-9      splines_3.4.4          
 [59] lazyeval_0.2.1          ModelMetrics_1.1.0     
 [61] princurve_2.1.0         trelliscopejs_0.1.14   
 [63] broom_0.5.0             checkmate_1.8.5        
 [65] heatmap.plus_1.3        rgl_0.99.16            
 [67] yaml_2.2.0              reshape2_1.4.3         
 [69] abind_1.4-5             threejs_0.3.1          
 [71] crosstalk_1.0.0         backports_1.1.3        
 [73] httpuv_1.4.4.2          caret_6.0-80           
 [75] tools_3.4.4             lava_1.6.2             
 [77] infer_0.3.1             gplots_3.0.1           
 [79] RColorBrewer_1.1-2      proxy_0.4-22           
 [81] BiocGenerics_0.24.0     analogue_0.17-0        
 [83] Rcpp_0.12.18            splus2R_1.2-2          
 [85] plyr_1.8.4              visNetwork_2.0.4       
 [87] base64enc_0.1-3         progress_1.2.0         
 [89] purrr_0.2.5             prettyunits_1.0.2      
 [91] rpart_4.1-13            diffusr_0.1.4          
 [93] zoo_1.8-3               sfsmisc_1.1-2          
 [95] cluster_2.0.7-1         magrittr_1.5           
 [97] data.table_1.11.4       TSclust_1.2.4          
 [99] mvtnorm_1.0-8           whisker_0.3-2          
[101] matrixStats_0.53.1      hms_0.4.2              
[103] NetPreProc_1.1          mime_0.6               
[105] evaluate_0.11           xtable_1.8-3           
[107] mclust_5.4.1            gridExtra_2.3          
[109] compiler_3.4.4          tibble_1.4.2           
[111] maps_3.3.0              mgc_1.0.1              
[113] KernSmooth_2.23-15      crayon_1.3.4           
[115] R.oo_1.22.0             htmltools_0.3.6        
[117] mgcv_1.8-23             later_0.7.3            
[119] tidyr_0.8.1             RcppParallel_4.4.1     
[121] lubridate_1.7.4         magic_1.5-8            
[123] fpc_2.1-11              autocogs_0.0.1         
[125] MASS_7.3-49             Matrix_1.2-14          
[127] permute_0.9-4           gdata_2.18.0           
[129] wmtsa_2.0-3             R.methodsS3_1.7.1      
[131] dotCall64_1.0-0         bindr_0.1.1            
[133] gower_0.1.2             igraph_1.2.1           
[135] ifultools_2.0-4         pkgconfig_2.0.1        
[137] registry_0.5            brglm_0.6.1            
[139] ExPosition_2.8.19       philentropy_0.2.0      
[141] microbenchmark_1.4-4    recipes_0.1.3          
[143] clv_0.3-2.1             webshot_0.5.0          
[145] prodlim_2018.04.18      LPStimeSeries_1.0-5    
[147] stringr_1.3.1           digest_0.6.18          
[149] pls_2.6-0               vegan_2.5-2            
[151] graph_1.56.0            rmarkdown_1.10         
[153] dendextend_1.8.0        uwot_0.0.0.9004        
[155] kernlab_0.9-26          gtools_3.8.1           
[157] modeltools_0.2-22       shiny_1.1.0            
[159] nlme_3.1-131            glasso_1.10            
[161] jsonlite_1.5            bindrcpp_0.2.2         
[163] alluvial_0.1-2          TSdist_3.4             
[165] pillar_1.3.0            lattice_0.20-35        
[167] httr_1.3.1              DEoptimR_1.0-8         
[169] survival_2.41-3         glue_1.3.0             
[171] xts_0.11-0              prabclus_2.2-6         
[173] class_7.3-14            stringi_1.2.3          
[175] pdc_1.0.3               KODAMA_1.5             
[177] rsample_0.0.2           caTools_1.17.1         
[179] dplyr_0.7.6             hglasso_1.2            

Distance Calculations

The first step is to take our flow case data and calculate as many distances, similarities and correlations as possible. Some of these calculations may take seconds while others may take hours, so we generally run this overnight and then kill the function if it is still running in the morning. If the file parameter is set, each distance is written to file as soon as it finishes, so killing the function does not harm data that has already been written.


# Parallel set-up: lift the size limit on globals shipped to workers, register
# doFuture as the foreach backend, and use `cores` multicore workers.
library(future)
library(doFuture)
options(future.globals.maxSize = Inf)
registerDoFuture()
plan(multicore, workers = cores)

# Calculate every available distance, similarity and correlation on the case
# data. Finished results are written to the path given by `file`, so a run
# that is killed early keeps whatever was already written.
flow_case_dists <- precise_dist(
  as.matrix(flow_cases),
  dists = "all_dists",
  suffix = "",
  file = name(file = "flow_case_dists"),
  parallel = TRUE,
  local_timeout = Inf,
  verbose = TRUE
)

Now we coerce all non-distances (i.e. similarities and correlations) into distances.


# Coerce every similarity and correlation in the results into a distance.
flow_case_distances <- precise_transform(flow_case_dists, enforce_dist = TRUE)

UMAP Results for Every Distance


# Parallel set-up for the UMAP runs; this step uses 4 workers rather than
# the `cores` variable.
library(future)
library(doFuture)
options(future.globals.maxSize = +Inf)
registerDoFuture()
plan(multicore, workers = 4)
# Run UMAP on every distance in flow_case_distances. The input is flagged as
# distances (distance = TRUE) and no color vector or palette is supplied.
# NOTE(review): type = "plotly" presumably requests interactive plotly
# output — confirm against the PreciseDist documentation.
flow_case_umap <- precise_umap(
  data = flow_case_distances,
  distance = TRUE,
  n_neighbors = 5,
  spread = 10,
  min_dist = 0.1,
  bandwidth = 1,
  type = "plotly",
  color_vec = NULL,
  colors = NULL,
  parallel = TRUE,
  verbose = TRUE
)

# Collect the per-distance UMAP results into one trellis display.
# NOTE(review): name() appends its default ".rds" extension, so `path` ends in
# "trellis_flow_case_umap.rds" — confirm that this is the intended output
# location for a trellis display.
precise_trellis(
  data = flow_case_umap, 
  name = "UMAP of Every Flow Distance",
  path = name(file = "trellis_flow_case_umap"), 
  self_contained = TRUE
)

# Correlate the distances with one another using the Pearson method and 101
# permutations; this step is run without parallelism.
flow_case_distance_correlations <- precise_correlations(
  data = flow_case_distances,
  method = "pearson",
  permutations = 101,
  parallel = FALSE,
  verbose = TRUE
)

# Draw a correlation heatmap of the `statistic` element of the result.
heatmaply_cor(flow_case_distance_correlations$statistic)

# Parallel set-up for the Soergel run; this step uses 5 workers.
library(future)
library(doFuture)
options(future.globals.maxSize = Inf)
registerDoFuture()
plan(multicore, workers = 5)

# Calculate only the Soergel distance on the case data, with partitions = 10,
# and write the result to the path given by `file`.
# NOTE(review): presumably `partitions` splits the calculation into 10
# pieces — confirm against the PreciseDist documentation.
flow_case_soergel_dists <- precise_dist(
  as.matrix(flow_cases),
  dists = "soergel",
  suffix = "",
  partitions = 10,
  file = name(file = "flow_case_soergel_dists"),
  parallel = TRUE,
  local_timeout = Inf,
  verbose = TRUE
)

# Transform and fuse the Soergel results. The steps run strictly in this
# order: return the input as a list, enforce similarities, apply
# fixed_k = 100, apply the "laplacian" transform, enforce distances, and
# finally fuse with fusion = "fuse".
# NOTE(review): presumably the list holds one matrix per partition of the
# precise_dist() call — confirm.
flow_case_soergel_fusion <- flow_case_soergel_dists %>%
  precise_transform(return_list = TRUE) %>%
  precise_transform(enforce_sim = TRUE) %>%
  precise_transform(fixed_k = 100) %>%
  precise_transform(transform = "laplacian") %>%
  precise_transform(enforce_dist = TRUE) %>%
  precise_fusion(fusion = "fuse")

# Parallel set-up for graph construction; this step uses 8 workers.
library(future)
library(doFuture)
options(future.globals.maxSize = +Inf)
registerDoFuture()
plan(multicore, workers = 8)
# Build a graph from the fused Soergel result.
# NOTE(review): distance = FALSE is passed here although the last transform
# before fusion was enforce_dist = TRUE — confirm that the fused output is
# meant to be treated as a similarity.
flow_case_soergel_graph <- precise_graph(
  data = flow_case_soergel_fusion,
  method = 5,
  distance = FALSE,
  n_neighbors = 75,
  spread = 1,
  min_dist = 0.0,
  bandwidth = 1,
  parallel = TRUE,
  verbose = TRUE
)

# Visualise the graph with plot_type = "drl_2d_plot". No color vector or
# palette is supplied, and graphml / html are left NULL.
# NOTE(review): presumably NULL for graphml / html means nothing is exported
# to file — confirm.
flow_case_soergel_2d_plot <- precise_viz(
  data = flow_case_soergel_graph,
  plot_type = "drl_2d_plot",
  k = 50,
  jitter = 2.5,
  color_vec = NULL,
  colors = NULL,
  size = 0.5,
  graphml = NULL,
  html = NULL,
  verbose = TRUE
)

# Show the plot held in the `visual_output` element of the result.
flow_case_soergel_2d_plot$visual_output

library(fpc)
library(purrr)
library(tibble)
# Cluster the 2D layout with PAM. pamk() tries every k in 2:20 and keeps the
# k with the best average silhouette width (criterion = "asw"). The cluster
# assignment of each row is then labelled "cluster_<k>" and stored in a
# one-column tibble named `pam_clusters`.
#
# The labels are built with the vectorised paste0() and placed directly into
# tibble(), so no dplyr::select() call is needed (dplyr is not attached in
# this document) and as_tibble() is not called on a bare vector.
flow_case_soergel_clusters <- flow_case_soergel_2d_plot$plot_layout %>%
  pamk(
    krange = 2:20,
    criterion = "asw",
    usepam = TRUE,
    scaling = FALSE,
    alpha = 0.001,
    diss = FALSE,
    critout = FALSE,
    ns = 10,
    seed = NULL
  ) %>%
  .[["pamobject"]] %>%   # first element of the pamk() result: the pam fit
  .[["clustering"]] %>%  # integer cluster id per row
  paste0("cluster_", .) %>%
  tibble(pam_clusters = .)

# Map descriptors onto the 2D plot result. The descriptors are the PAM
# cluster labels column-bound to the original flow case columns.
flow_case_soergel_descriptors <- precise_descriptors(
  flow_case_soergel_2d_plot,
  descriptors = cbind(flow_case_soergel_clusters, flow_cases),
  verbose = TRUE,
  rank = TRUE,
  size = 0.5
)

# Collect the descriptor plots into one trellis display.
# NOTE(review): name() appends its default ".rds" extension, so `path` ends in
# "trellis_flow_case_soergel_descriptors.rds" — confirm that this is the
# intended output location for a trellis display.
precise_trellis(
  data = flow_case_soergel_descriptors, 
  name = "Flow Columns and Cluster Results Mapped to Visual Results of Fused Soergel Distances",
  path = name(file = "trellis_flow_case_soergel_descriptors"), 
  self_contained = TRUE
)